Word2Vec using Noise Contrastive Estimation and visualization with t-SNE


In [18]:
import os
from random import randint
from collections import Counter
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

import urllib.request
import zipfile

import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

In [28]:
# Parameters for downloading data
DOWNLOAD_URL = 'http://mattmahoney.net/dc/'
EXPECTED_BYTES = 31344016
DATA_FOLDER = './data/'
FILE_NAME = 'text8.zip'

In [29]:
def download(file_name, expected_bytes):
    """ Download the dataset text8 if it's not already downloaded """
    file_path = DATA_FOLDER + file_name
    if os.path.exists(file_path):
        print("Dataset ready")
        return file_path
    file_name, _ = urllib.request.urlretrieve(DOWNLOAD_URL + file_name, file_path)
    file_stat = os.stat(file_path)
    if file_stat.st_size == expected_bytes:
        print('Successfully downloaded the file', file_name)
    else:
        raise Exception('File ' + file_name +
                        ' might be corrupted. You should try downloading it with a browser.')
    return file_path
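
The cells below read DATA_FOLDER + FILE_NAME directly, so the archive is assumed to already be on disk. If it is not, a call like the following sketch would fetch it first (hypothetical usage, not part of the original run):

# fetch text8.zip into ./data/ if it is missing
os.makedirs(DATA_FOLDER, exist_ok=True)
file_path = download(FILE_NAME, EXPECTED_BYTES)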

In [30]:
def read_data(file_path):
    """ Read data into a list of tokens 
    There should be 17,005,207 tokens
    """
    with zipfile.ZipFile(file_path) as f:
        words = tf.compat.as_str(f.read(f.namelist()[0])).split() 
        # tf.compat.as_str() converts the bytes read from the zip archive into a string
    return words

In [20]:
# corpus = 'the quick brown fox jumped over the lazy dog'
corpus = read_data(DATA_FOLDER + FILE_NAME)
corpus[:10]


Out[20]:
['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against']

In [21]:
def build_vocab(words, vocab_size):
    """ Build vocabulary of VOCAB_SIZE most frequent words """
    dictionary = dict()
    count = [('UNK', -1)]
    count.extend(Counter(words).most_common(vocab_size - 1))
    index = 0
    os.makedirs('vocabulary', exist_ok=True)
    with open('vocabulary/vocab_1000.tsv', "w") as f:
        for word, _ in count:
            dictionary[word] = index
            if index < 1000:
                f.write(word + "\n")
            index += 1
    index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, index_dictionary

In [22]:
vocabulary, reverse_vocabulary = build_vocab(corpus, 5000)

In [23]:
len(vocabulary)


Out[23]:
5000

In [6]:
def index_words_in_corpus(corpus):
    return [vocabulary.get(token, 0) for token in corpus]
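
A quick sanity check, as a sketch rather than part of the original run: tokens outside the 5,000-word vocabulary fall back to index 0, which reverse_vocabulary maps back to 'UNK'.

# sketch: out-of-vocabulary tokens map to index 0 ('UNK')
sample = index_words_in_corpus(['anarchism', 'qwertyuiop'])
print(sample)                         # first entry is nonzero if 'anarchism' made the 5,000-word cut
print(reverse_vocabulary[sample[1]])  # 'UNK'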

In [24]:
corpus = index_words_in_corpus(corpus)

In [25]:
def generate_sample(index_words, context_window_size):
    """ Form training pairs according to the skip-gram model. """
    for index, center in enumerate(index_words):
        context = randint(1, context_window_size)
        # get a random target before the center word
        for target in index_words[max(0, index - context): index]:
            yield center, target
        # get a random target after the center word
        for target in index_words[index + 1: index + context + 1]:
            yield center, target
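
A minimal sketch of what generate_sample yields, using a made-up index sequence and a window of 1 so the output is deterministic (the indices are hypothetical, purely for illustration):

# sketch: skip-gram (center, context) pairs from a toy index sequence
toy_indices = [5, 2, 7, 9]
list(generate_sample(toy_indices, 1))
# -> [(5, 2), (2, 5), (2, 7), (7, 2), (7, 9), (9, 7)]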

In [26]:
# [_ for _ in generate_sample(corpus, 2)]

In [27]:
def get_batch(iterator, batch_size):
    """ Group a numerical stream into batches and yield them as Numpy arrays. """
    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros([batch_size, 1], dtype=np.int32)
        for index in range(batch_size):
            center_batch[index], target_batch[index] = next(iterator)
        yield center_batch, target_batch
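
A short shape check, again just a sketch: get_batch yields a (batch_size,) int32 vector of center words and a (batch_size, 1) int32 column of targets, matching the center_words and target_words placeholders defined later.

# sketch: inspect the shapes of one batch
centers, targets = next(get_batch(generate_sample(corpus, 1), 8))
print(centers.shape, targets.shape)   # (8,) (8, 1)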

In [11]:
# [_ for _ in get_batch(generate_sample(corpus, 1), 2)]

In [12]:
VOCAB_SIZE = 5000
BATCH_SIZE = 32
EMBED_SIZE = 10 # dimension of the word embedding vectors
SKIP_WINDOW = 3 # maximum number of context words sampled on each side of the center word
NUM_SAMPLED = 16    # Number of negative examples to sample.
LEARNING_RATE = 1.0
NUM_TRAIN_STEPS = 10000
SKIP_STEP = 100 # how many steps to skip before reporting the loss

In [13]:
class SkipGramModel:
    """ Build the graph for word2vec model """
    def __init__(self, vocab_size, embed_size, batch_size, num_sampled, learning_rate):
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.batch_size = batch_size
        self.num_sampled = num_sampled
        self.lr = learning_rate
        self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')

    def _create_placeholders(self):
        """ Step 1: define the placeholders for input and output """
        with tf.name_scope("data"):
            self.center_words = tf.placeholder(tf.int32, shape=[self.batch_size], name='center_words')
            self.target_words = tf.placeholder(tf.int32, shape=[self.batch_size, 1], name='target_words')

    def _create_embedding(self):
        """ Step 2: define weights. In word2vec, it's actually the weights that we care about """
        with tf.name_scope("embed"):
            self.embed_matrix = tf.Variable(tf.random_uniform([self.vocab_size, 
                                                                self.embed_size], -1.0, 1.0), 
                                                                name='embed_matrix')

    def _create_loss(self):
        """ Step 3 + 4: define the model + the loss function """
        with tf.name_scope("loss"):
            # Step 3: define the inference
            embed = tf.nn.embedding_lookup(self.embed_matrix, self.center_words, name='embed')

            # Step 4: define loss function
            # construct variables for NCE loss
            nce_weight = tf.Variable(tf.truncated_normal([self.vocab_size, self.embed_size],
                                                        stddev=1.0 / (self.embed_size ** 0.5)), 
                                                        name='nce_weight')
            nce_bias = tf.Variable(tf.zeros([self.vocab_size]), name='nce_bias')

            # define loss function to be NCE loss function
            self.loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight, 
                                                biases=nce_bias, 
                                                labels=self.target_words, 
                                                inputs=embed, 
                                                num_sampled=self.num_sampled, 
                                                num_classes=self.vocab_size), name='loss')

    def _create_optimizer(self):
        """ Step 5: define optimizer """
        self.optimizer = tf.train.GradientDescentOptimizer(self.lr).minimize(self.loss, global_step=self.global_step)

    def _create_summaries(self):
        with tf.name_scope("summaries"):
            tf.summary.scalar("loss", self.loss)
            tf.summary.histogram("histogram_loss", self.loss)
            # because we have several summaries, we merge them all
            # into one op to make them easier to manage
            self.summary_op = tf.summary.merge_all()

    def build_graph(self):
        """ Build the graph for our model """
        self._create_placeholders()
        self._create_embedding()
        self._create_loss()
        self._create_optimizer()
        self._create_summaries()

def train_model(model, batch_gen, num_train_steps):
    saver = tf.train.Saver() # defaults to saving all variables - in this case embed_matrix, nce_weight, nce_bias

    initial_step = 0
    os.makedirs('./tf_checkpoints', exist_ok=True)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        ckpt = tf.train.get_checkpoint_state(os.path.dirname('./tf_checkpoints/checkpoint'))
        # if that checkpoint exists, restore from checkpoint
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)

        total_loss = 0.0 # we use this to calculate the average loss over the last SKIP_STEP steps
        writer = tf.summary.FileWriter('./tf_graphs/lr' + str(LEARNING_RATE), sess.graph)
        initial_step = model.global_step.eval()
        for index in range(initial_step, initial_step + num_train_steps):
            centers, targets = next(batch_gen)
            feed_dict={model.center_words: centers, model.target_words: targets}
            loss_batch, _, summary = sess.run([model.loss, model.optimizer, model.summary_op], 
                                              feed_dict=feed_dict)
            writer.add_summary(summary, global_step=index)
            total_loss += loss_batch
            if (index + 1) % SKIP_STEP == 0:
                print('Average loss at step {}: {:5.1f}'.format(index, total_loss / SKIP_STEP))
                total_loss = 0.0
                saver.save(sess, './tf_checkpoints/skip-gram', index)
        
        ####################
        # code to visualize the embeddings in TensorBoard
        # run "tensorboard --logdir=processed" to see the embeddings
        final_embed_matrix = sess.run(model.embed_matrix)
        
        # it has to be a Variable; constants don't work here, and you can't reuse model.embed_matrix
        embedding_var = tf.Variable(final_embed_matrix[:1000], name='embedding')
        sess.run(embedding_var.initializer)

        config = projector.ProjectorConfig()
        summary_writer = tf.summary.FileWriter('./processed')

        # add embedding to the config file
        embedding = config.embeddings.add()
        embedding.tensor_name = embedding_var.name
        
        # link this tensor to its metadata file, in this case the first 1000 words of the vocab
        embedding.metadata_path = './vocabulary/vocab_1000.tsv'

        # saves a configuration file that TensorBoard will read during startup.
        projector.visualize_embeddings(summary_writer, config)
        saver_embed = tf.train.Saver([embedding_var])
        saver_embed.save(sess, './processed/model3.ckpt', 1)
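
As a quick qualitative check, one could look up nearest neighbours by cosine similarity in the learned space. A minimal sketch, assuming train_model is changed to return final_embed_matrix (as written it returns nothing):

# sketch: cosine-similarity nearest neighbours in the embedding space
def nearest(word, embed_matrix, k=5):
    norm = embed_matrix / np.linalg.norm(embed_matrix, axis=1, keepdims=True)
    sims = norm @ norm[vocabulary[word]]          # cosine similarity to every word
    return [reverse_vocabulary[i] for i in np.argsort(-sims)[1:k + 1]]

# nearest('king', final_embed_matrix)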

In [14]:
model = SkipGramModel(VOCAB_SIZE, EMBED_SIZE, BATCH_SIZE, NUM_SAMPLED, LEARNING_RATE)
model.build_graph()
batch_generator = get_batch(generate_sample(corpus, SKIP_WINDOW), BATCH_SIZE)

In [15]:
train_model(model, batch_generator, NUM_TRAIN_STEPS)


Average loss at step 99:  52.0
Average loss at step 199:  39.2
Average loss at step 299:  33.6
Average loss at step 399:  26.9
Average loss at step 499:  27.0
Average loss at step 599:  22.8
Average loss at step 699:  21.7
Average loss at step 799:  20.5
Average loss at step 899:  19.6
Average loss at step 999:  20.0
Average loss at step 1099:  16.9
Average loss at step 1199:  16.4
Average loss at step 1299:  15.1
Average loss at step 1399:  14.3
Average loss at step 1499:  13.2
Average loss at step 1599:  13.7
Average loss at step 1699:  14.1
Average loss at step 1799:  11.8
Average loss at step 1899:  11.0
Average loss at step 1999:  11.7
Average loss at step 2099:  10.2
Average loss at step 2199:  12.0
Average loss at step 2299:  11.1
Average loss at step 2399:  12.3
Average loss at step 2499:  10.1
Average loss at step 2599:   9.4
Average loss at step 2699:   8.7
Average loss at step 2799:   7.9
Average loss at step 2899:   8.6
Average loss at step 2999:   9.2
Average loss at step 3099:   7.8
Average loss at step 3199:   7.4
Average loss at step 3299:   7.0
Average loss at step 3399:   7.2
Average loss at step 3499:   6.7
Average loss at step 3599:   7.7
Average loss at step 3699:   6.3
Average loss at step 3799:   7.3
Average loss at step 3899:   6.5
Average loss at step 3999:   7.3
Average loss at step 4099:   6.4
Average loss at step 4199:   6.2
Average loss at step 4299:   6.4
Average loss at step 4399:   6.0
Average loss at step 4499:   5.7
Average loss at step 4599:   6.2
Average loss at step 4699:   5.6
Average loss at step 4799:   5.9
Average loss at step 4899:   6.3
Average loss at step 4999:   6.3
Average loss at step 5099:   5.5
Average loss at step 5199:   6.0
Average loss at step 5299:   5.7
Average loss at step 5399:   5.5
Average loss at step 5499:   5.2
Average loss at step 5599:   4.5
Average loss at step 5699:   5.0
Average loss at step 5799:   5.2
Average loss at step 5899:   5.2
Average loss at step 5999:   5.5
Average loss at step 6099:   5.7
Average loss at step 6199:   5.3
Average loss at step 6299:   5.4
Average loss at step 6399:   5.0
Average loss at step 6499:   5.6
Average loss at step 6599:   4.6
Average loss at step 6699:   5.7
Average loss at step 6799:   5.0
Average loss at step 6899:   5.1
Average loss at step 6999:   4.9
Average loss at step 7099:   4.6
Average loss at step 7199:   4.5
Average loss at step 7299:   4.5
Average loss at step 7399:   4.5
Average loss at step 7499:   4.5
Average loss at step 7599:   4.8
Average loss at step 7699:   4.7
Average loss at step 7799:   4.3
Average loss at step 7899:   4.6
Average loss at step 7999:   4.3
Average loss at step 8099:   4.4
Average loss at step 8199:   4.2
Average loss at step 8299:   4.2
Average loss at step 8399:   4.6
Average loss at step 8499:   4.4
Average loss at step 8599:   4.5
Average loss at step 8699:   4.3
Average loss at step 8799:   4.4
Average loss at step 8899:   4.3
Average loss at step 8999:   4.6
Average loss at step 9099:   4.1
Average loss at step 9199:   4.2
Average loss at step 9299:   4.3
Average loss at step 9399:   5.0
Average loss at step 9499:   4.1
Average loss at step 9599:   4.3
Average loss at step 9699:   4.0
Average loss at step 9799:   4.5
Average loss at step 9899:   4.6
Average loss at step 9999:   4.6
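
The TensorBoard projector above already provides an interactive t-SNE view of the saved embedding. For a static plot, one could also run scikit-learn's t-SNE over the embedding matrix directly; the sketch below assumes scikit-learn and matplotlib are installed and that final_embed_matrix is available as in the earlier nearest-neighbour sketch.

# sketch: project the first 500 embeddings to 2-D with t-SNE and label them
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

two_d = TSNE(n_components=2, perplexity=30, random_state=0).fit_transform(final_embed_matrix[:500])

plt.figure(figsize=(14, 14))
for i, (x, y) in enumerate(two_d):
    plt.scatter(x, y, s=4)
    plt.annotate(reverse_vocabulary[i], (x, y), fontsize=7)
plt.show()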

In [ ]: